#!/usr/bin/env python
#
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scale hOCR output back to the original image resolution.
# Generously borrowed from:
# https://github.com/NCSU-Libraries/ocracoke/blob/c06c7b8c727b2d92b751921631d66e1ab3bfafd4/app/processing_helpers/hocr_resizer.rb

from __future__ import print_function
import argparse
import base64
import glob
import io
import os.path
import re
import sys
import zlib

from lxml import etree, html

def eprint(*args, **kwargs):
    """Print ``args`` to standard error, forwarding ``kwargs`` to ``print``."""
    # Resolve the stream at call time so a reassigned sys.stderr is honoured.
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)

def scale_hocr(data, percentage=50, savefile=False):
    """Scale every hOCR file found at ``data`` by the given percentage.

    Args:
        data: Path to a single hOCR file, or to a directory whose ``*.hocr``
            files are processed in sorted order.
        percentage: Scale factor in percent, handed to ``_scale_hocr``.
        savefile: Output path handed unchanged to ``_scale_hocr`` for every
            input file (the same value is reused for each file).

    Exits:
        Prints a warning and calls ``sys.exit(0)`` when no input file is
        found, including when ``data`` is neither a directory nor a file.
    """
    if os.path.isdir(data):
        files = sorted(glob.glob(os.path.join(data, '*.hocr')))
    elif os.path.isfile(data):
        files = [os.path.realpath(data)]
    else:
        # Without this fallback `files` would be unbound and the check below
        # would raise UnboundLocalError instead of reporting the problem.
        files = []
    if not files:
        print("WARNING: No HOCR files found at path ", data,
              "\nScript cannot proceed without them and will terminate now.\n")
        sys.exit(0)
    for hocr in files:
        _scale_hocr(hocr, percentage, savefile)

# Four whitespace-separated integer coordinates of an hOCR ``bbox`` property.
_BBOX_RE = re.compile(r'bbox((\s+\d+){4})')
# Two whitespace-separated numeric fields of an hOCR ``baseline`` property.
_BASELINE_RE = re.compile(r'baseline((\s+[\d\.\-]+){2})')


def _scale_title(title, pct):
    """Return ``title`` with its ``bbox`` and ``baseline`` values scaled.

    Args:
        title: Contents of an hOCR ``title`` attribute.
        pct: Multiplier applied to the coordinates (1.0 means unchanged).

    Returns:
        The title with every ``bbox`` coordinate multiplied by ``pct`` and
        rounded to an integer, and the second ``baseline`` field multiplied
        by ``pct``. The first baseline field is kept as is (in the hOCR
        specification it is a slope, which does not depend on resolution).
    """

    def scale_baseline(match):
        first, second = match.group(1).split()
        try:
            return "baseline %s %s" % (float(first), float(second) * pct)
        except ValueError:
            # The pattern also admits non-numbers such as "-" or "1.2.3";
            # leave those untouched rather than aborting the whole file.
            return match.group(0)

    def scale_bbox(match):
        coords = ["{:.0f}".format(float(value) * pct)
                  for value in match.group(1).split()]
        return "bbox %s" % " ".join(coords)

    # Substituting on the match itself (instead of str.replace on a rebuilt
    # string) keeps working when the fields are separated by tabs or by
    # several spaces.
    title = _BASELINE_RE.sub(scale_baseline, title)
    return _BBOX_RE.sub(scale_bbox, title)


def _scale_hocr(hocrfile, percentage=50, savefile=False):
    """Scale the text bounding boxes and baselines of one hOCR file.

    Args:
        hocrfile: Path of the hOCR file to read.
        percentage: Scale factor in percent; coordinates are multiplied by
            ``percentage / 100.0``.
        savefile: Path to write the serialized result to. When false, the
            result is printed to standard output instead.
    """
    pct = percentage / 100.0
    eprint("Reading from %s" % (hocrfile))
    hocr = etree.parse(hocrfile, html.XHTMLParser())
    for node in hocr.xpath('//*[@class]'):
        title = node.get('title')
        if title is None:
            # An element may carry a class without a title; nothing to scale.
            continue
        node.set('title', _scale_title(title, pct))
    scaled = etree.tostring(hocr)
    if savefile:
        # etree.tostring() returns bytes, so the file must be opened in
        # binary mode (text mode raises TypeError on Python 3).
        with open(savefile, "wb") as f:
            f.write(scaled)
    else:
        print(scaled.decode('utf-8'))

if __name__ == "__main__":
    # Command-line entry point: parse the options, validate the input path
    # and hand everything over to scale_hocr().
    cli = argparse.ArgumentParser(
        description="Scale coordinates for a given hOCR file")
    cli.add_argument("imgdir", help="directory or path of hOCR files")
    cli.add_argument('--percent', type=int, default=50, help='Scale percentage')
    cli.add_argument(
        "--savefile",
        help="Save to this file instead of outputting to stdout")
    options = cli.parse_args()

    # Cowardly refusing to scale by 0
    if options.percent == 0:
        sys.exit(0)

    target = options.imgdir
    if not (os.path.isdir(target) or os.path.isfile(target)):
        sys.exit("ERROR: Given path '" + target + "' is not a file or directory")

    scale_hocr(target, options.percent, options.savefile)
